פרק 7 - ggplot

# Make sure ggplot2 is installed, then attach it.
# requireNamespace() only checks that the package can be loaded; it does not
# attach it, so the install step runs only when the package is missing and
# library() below is the single place where ggplot2 gets attached.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library("ggplot2")

הורד את הקובץ songs.csv ושמור אותו בתיקיית העבודה של R (הקוד קורא אותו באמצעות read.csv('songs.csv')).

# Install (only when missing) and attach every package this script relies on.
# A freshly installed package is attached right away with library(); checking
# with require() alone would install a missing package but leave it unattached
# for the rest of the session.
# The order matters: tidylog is attached after tidyverse so that its logging
# wrappers take precedence over the dplyr verbs.
required_packages <- c(
  "ggplot2",   # for data visualization
  "tidyverse", # dealing with dataframes
  "tidylog",   # logs for tidyverse
  "plotrix"    # for the standard error function
)
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # character.only = TRUE makes library() use the value of pkg, not the name.
  library(pkg, character.only = TRUE)
}

# Load the songs data from songs.csv (looked up in the working directory) and
# keep only the rows that have no missing values.
# NOTE(review): the original comment says na.omit() drops no rows for this
# file; that cannot be confirmed without the data.
songsDataset <- na.omit(read.csv("songs.csv"))

# Add a readable label column, "explicit_text", derived from the binary
# "explicit" column: 1 is labelled "Explicit" and 0 is labelled "Implicit".
# Rows matching neither condition get NA (case_when's default).
songsDataset <- mutate(
  songsDataset,
  explicit_text = case_when(
    explicit == 1 ~ "Explicit",
    explicit == 0 ~ "Implicit"
  )
)

# Descriptive statistics of "popularity", one row per explicit_text group
# (Explicit songs vs. Implicit songs):
#   popularity_mean - group mean
#   n               - group sample size
#   std             - standard deviation
#   sterr           - standard error of the mean (plotrix::std.error)
#   median          - group median
# The density plot further down reads these column names, so they must stay
# as they are.
stats_popularity_per_type <- summarise(
  group_by(songsDataset, explicit_text),
  popularity_mean = mean(popularity),
  n = n(),
  std = sd(popularity),
  sterr = std.error(popularity),
  median = median(popularity)
)

# Simultaneous Q-Q plots of the variable popularity, one panel per
# explicit_text group (Implicit & Explicit songs), used to check the
# normality assumption within each group.
ggplot(songsDataset) +
  # sample quantiles of popularity, coloured by group
  geom_qq(aes(sample = popularity, color = explicit_text), size = 1) +
  # reference line drawn separately in each panel
  geom_qq_line(aes(sample = popularity)) +
  facet_wrap(~explicit_text, ncol = 6, shrink = TRUE) +
  # the panel strips already name the group, so the colour legend is hidden
  guides(color = "none") +
  labs(
    x = "Theoretical Z score",
    y = "Popularity",
    title = "QQplot for the variable popularity in both Implicit & Explicit songs",
    subtitle = "To check the normality assumption on each group"
  ) +
  theme(
    plot.title = element_text(color = "grey20", size = 16, face = "bold.italic"),
    plot.subtitle = element_text(color = "grey20", face = "italic")
  )

# Density of popularity for each explicit_text group, one stacked panel per
# group. A dashed vertical line marks each group's mean, and the group's N,
# mean, median, standard deviation and standard error (taken from
# stats_popularity_per_type) are printed as text at x = 87.5.
ggplot(
  data = songsDataset,
  mapping = aes(x = popularity, fill = explicit_text)
) +
  # panels, palettes and legends
  facet_wrap(~explicit_text, ncol = 1) +
  scale_fill_manual(values = c("cornflowerblue", "darkgoldenrod")) +
  scale_color_manual(values = c("cornflowerblue", "darkgoldenrod")) +
  guides(color = "none", fill = "none") +
  # density curve per group
  geom_density(alpha = 0.5) +
  # group mean as a dashed vertical line
  geom_vline(
    data = stats_popularity_per_type,
    mapping = aes(xintercept = popularity_mean),
    linetype = "dashed"
  ) +
  # per-group statistics, one text layer per statistic, stacked top to bottom
  geom_text(
    data = stats_popularity_per_type,
    mapping = aes(
      x = 87.5, y = 0.03,
      label = paste("N:", n),
      color = explicit_text
    ),
    size = 4
  ) +
  geom_text(
    data = stats_popularity_per_type,
    mapping = aes(
      x = 87.5, y = 0.025,
      label = paste("Mean:", round(popularity_mean, 2)),
      color = explicit_text
    ),
    size = 4
  ) +
  geom_text(
    data = stats_popularity_per_type,
    mapping = aes(
      x = 87.5, y = 0.02,
      label = paste("Median:", round(median, 2)),
      color = explicit_text
    ),
    size = 4
  ) +
  geom_text(
    data = stats_popularity_per_type,
    mapping = aes(
      x = 87.5, y = 0.015,
      label = paste("Std:", round(std, 2)),
      color = explicit_text
    ),
    size = 4
  ) +
  geom_text(
    data = stats_popularity_per_type,
    mapping = aes(
      x = 87.5, y = 0.01,
      label = paste("Sterr:", round(sterr, 2)),
      color = explicit_text
    ),
    size = 4
  ) +
  # titles and text styling
  labs(
    title = "Is there a difference in popularity between Explicit and Implicit songs?",
    subtitle = "each vertical line represents the sample mean",
    x = "Popularity grade",
    y = "Density"
  ) +
  theme(
    plot.title = element_text(color = "grey25", size = 16, face = "bold.italic"),
    plot.subtitle = element_text(color = "grey25", face = "italic")
  )